suppressPackageStartupMessages({
library(tidyverse)
library(MOFA2)
library(Matrix)
library(SingleCellExperiment)
library(scran)
library(glue)
library(scater)
library(patchwork)
library(batchelor)
library(rhdf5)
library(ggraph)
}
)
Define plotting utils
remove_x_axis <- function(){
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())
}
remove_y_axis <- function(){
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())
}
org_colors <- read_csv("~/Pan_fetal_immune/metadata/organ_colors.csv")
Missing column names filled in: 'X1' [1]Error in (function (srcref) : unimplemented type (29) in 'eval'
figdir <- "~/mount/gdrive/Pan_fetal/Updates_and_presentations/figures/MOFA_analysis/"
if (!dir.exists(figdir)){ dir.create(figdir) }
indir <- glue("/nfs/team205/ed6/data/Fetal_immune/LMM_data/LMM_input_{split}_PBULK/")
Error: glue cannot interpolate functions into strings.
* object 'split' is a function.
## Plot number of cells per organ/celltype pair
n_cells_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_cells=sum(n_cells)) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=log10(n_cells))) +
geom_text(aes(label=n_cells), color="white") +
scale_fill_viridis_c() +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_samples_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_samples=n()) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=n_samples)) +
geom_text(aes(label=n_samples), color="white") +
scale_fill_viridis_c(option="cividis") +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_cells_heatmap / n_samples_heatmap
# Exclude celltypes present in just one organ
keep_ct <- data.frame(colData(sce)) %>%
select(organ, anno_lvl_2_final_clean) %>%
distinct() %>%
group_by(anno_lvl_2_final_clean) %>%
summarise(n=n()) %>%
ungroup() %>%
filter(n > 1) %>%
pull(anno_lvl_2_final_clean)
Error in (function (classes, fdef, mtable) :
unable to find an inherited method for function 'select' for signature '"data.frame"'
## Plot number of cells per organ/celltype pair
n_cells_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_cells=sum(n_cells)) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=log10(n_cells))) +
geom_text(aes(label=n_cells), color="white") +
scale_fill_viridis_c() +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_samples_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_samples=n()) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=n_samples)) +
geom_text(aes(label=n_samples), color="white") +
scale_fill_viridis_c(option="cividis") +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_cells_heatmap / n_samples_heatmap
## Feature selection w scran WITHIN CELLTYPE
anno_groups <- split(colnames(sce), sce$anno_lvl_2_final_clean)
all_hvgs <- c()
for (i in anno_groups){
dec <- modelGeneVar(sce[,i])
hvgs <- getTopHVGs(dec, n = 1000)
all_hvgs <- union(all_hvgs, hvgs)
}
sce <- sce[which(rowSums(logcounts(sce)) > 0),]
sce
class: SingleCellExperiment
dim: 28260 993
metadata(0):
assays(1): logcounts
rownames(28260): TSPAN6 TNMD ... AP000646.1 AP006216.3
rowData names(0):
colnames(993): F45_SK_CD45P_FCAImmP7579224-F45-SK-CD4+T-12-5GEX F45_SK_CD45P_FCAImmP7579224-F45-SK-CD8+T-12-5GEX ...
F50_SP_CD45P_FCAImmP7803020-F50-SP-IMMATURE_B-15-5GEX F30_TH_CD45N_FCAImmP7277565-F30-TH-ABT(ENTRY)-14-3GEX
colData names(7): Sample donor ... method n_cells
reducedDimNames(0):
altExpNames(0):
EDA with PCA
sce <- runPCA(sce, scale=TRUE, ncomponents=30,
exprs_values = "logcounts", subset_row=all_hvgs)
# ## Variance explained
# percent.var <- attr(reducedDim(sce), "percentVar")
# plot(percent.var, log="y", xlab="PC", ylab="Variance explained (%)")
plotPCA(sce, colour_by="donor", ncomponents=6)
plotPCA(sce, colour_by="method", ncomponents=6)
plotPCA(sce, colour_by="organ", ncomponents=10)
Minimize obvious technical effects (3GEX/5GEX) using linear regression (following procedure from OSCA)
## Regress technical effects
design <- model.matrix(~donor+method,data=colData(sce))
residuals <- regressBatches(sce, assay.type = "logcounts", design = design)
assay(sce, "corrected_logcounts") <- as.matrix(assay(residuals[,colnames(sce)], "corrected"))
## Regress organ (soup effect)
design <- model.matrix(~organ,data=colData(sce)) ## Include organ term to capture soup
residuals <- regressBatches(sce, assay.type = "corrected_logcounts", design = design)
assay(sce, "corrected_logcounts") <- as.matrix(assay(residuals[,colnames(sce)], "corrected"))
Check regression has an effect repeating PCA
sce <- runPCA(sce, scale=TRUE, ncomponents=30, exprs_values = "corrected_logcounts")
plotPCA(sce, colour_by="method", ncomponents=6)
plotPCA(sce, colour_by="donor", ncomponents=6)
plotPCA(sce, colour_by="organ", ncomponents=8)
## Feature selection w scran WITHIN CELLTYPE
anno_groups <- split(colnames(sce), sce$anno_lvl_2_final_clean)
all_hvgs <- c()
for (i in anno_groups){
dec <- modelGeneVar(sce[,i], assay.type = "corrected_logcounts")
hvgs <- getTopHVGs(dec, n = 1000)
all_hvgs <- union(all_hvgs, hvgs)
}
Make MOFA object (Use celltypes as grouping covariate)
mofa_obj <- readRDS(glue('{indir}LYMPHOID_mofa_obj_organCorrected.RDS'))
Loading required package: MOFA2
Attaching package: ‘MOFA2’
The following object is masked from ‘package:stats’:
predict
object <- mofa_obj
Prepare 4 training
object
Untrained MOFA model with the following characteristics:
Number of views: 1
Views names: corrected_logcounts
Number of features (per view): 7300
Number of groups: 23
Groups names: ABT(ENTRY) B1 CD4+T CD8+T CD8AA CYCLING_MPP CYCLING_NK CYCLING_T HSC_MPP ILC3 IMMATURE_B LARGE_PRE_B LATE_PRO_B LMPP_ELP MATURE_B MEMP NK NK_T PRE_PRO_B PRO_B SMALL_PRE_B TH17 TREG
Number of samples (per group): 26 32 63 55 24 28 61 38 32 62 29 54 32 10 55 26 87 52 40 50 46 43 48
Wrapped in run_mofa.R
# install.packages(renv)
# renv::init()
renv::install("reticulate")
renv::use_python()
py_pkgs <- c(
"scanpy",
"anndata",
"mofapy2"
)
reticulate::py_install(py_pkgs)
mofa_trained <- run_mofa(object, outfile = outfile)
Warning: Output file /nfs/team205/ed6/data/Fetal_immune/LMM_data/LMM_input_LYMPHOID_PBULK/LYMPHOID_mofa_model_oneview_organCorrected.hdf5 already exists, it will be replaced
Connecting to the mofapy2 python package using reticulate (use_basilisk = FALSE)...
Please make sure to manually specify the right python binary when loading R with reticulate::use_python(..., force=TRUE) or the right conda environment with reticulate::use_condaenv(..., force=TRUE)
If you prefer to let us automatically install a conda environment with 'mofapy2' installed using the 'basilisk' package, please use the argument 'use_basilisk = TRUE'
#########################################################
### __ __ ____ ______ ###
### | \/ |/ __ \| ____/\ _ ###
### | \ / | | | | |__ / \ _| |_ ###
### | |\/| | | | | __/ /\ \_ _| ###
### | | | | |__| | | / ____ \|_| ###
### |_| |_|\____/|_|/_/ \_\ ###
### ###
#########################################################
Successfully loaded view='corrected_logcounts' group='ABT(ENTRY)' with N=26 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='B1' with N=32 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='CD4+T' with N=63 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='CD8+T' with N=55 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='CD8AA' with N=24 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='CYCLING_MPP' with N=28 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='CYCLING_NK' with N=61 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='CYCLING_T' with N=38 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='HSC_MPP' with N=32 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='ILC3' with N=62 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='IMMATURE_B' with N=29 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='LARGE_PRE_B' with N=54 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='LATE_PRO_B' with N=32 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='LMPP_ELP' with N=10 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='MATURE_B' with N=55 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='MEMP' with N=26 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='NK' with N=87 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='NK_T' with N=52 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='PRE_PRO_B' with N=40 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='PRO_B' with N=50 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='SMALL_PRE_B' with N=46 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='TH17' with N=43 samples and D=7300 features...
Successfully loaded view='corrected_logcounts' group='TREG' with N=48 samples and D=7300 features...
WARNING: 'ard_factors' in model_options should be set to True if using multiple groups unless you are using MEFISTO
Model options:
- Automatic Relevance Determination prior on the factors: False
- Automatic Relevance Determination prior on the weights: True
- Spike-and-slab prior on the factors: False
- Spike-and-slab prior on the weights: True
Likelihoods:
- View 0 (corrected_logcounts): gaussian
Warning: some group(s) have less than 15 samples, MOFA won't be able to learn meaningful factors for these group(s)...
######################################
## Training the model with seed 2020 ##
######################################
ELBO before training: -128754892.30
Iteration 1: time=12.72, ELBO=1546417.23, deltaELBO=130301309.528 (101.20105512%), Factors=30
Iteration 2: time=12.40, ELBO=2808114.37, deltaELBO=1261697.141 (0.97992171%), Factors=30
Iteration 3: time=12.55, ELBO=3165428.61, deltaELBO=357314.241 (0.27751508%), Factors=30
Iteration 4: time=12.64, ELBO=3248002.54, deltaELBO=82573.933 (0.06413266%), Factors=30
Iteration 5: time=12.77, ELBO=3275153.02, deltaELBO=27150.480 (0.02108695%), Factors=30
Iteration 6: time=12.74, ELBO=3292546.06, deltaELBO=17393.044 (0.01350865%), Factors=30
Iteration 7: time=12.60, ELBO=3304528.49, deltaELBO=11982.426 (0.00930639%), Factors=30
Iteration 8: time=12.40, ELBO=3314496.32, deltaELBO=9967.833 (0.00774171%), Factors=30
Iteration 9: time=12.42, ELBO=3323682.59, deltaELBO=9186.263 (0.00713469%), Factors=30
Iteration 10: time=12.42, ELBO=3332226.53, deltaELBO=8543.946 (0.00663582%), Factors=30
Iteration 11: time=12.37, ELBO=3339946.37, deltaELBO=7719.842 (0.00599577%), Factors=30
Iteration 12: time=12.34, ELBO=3346980.33, deltaELBO=7033.956 (0.00546306%), Factors=30
Iteration 13: time=12.38, ELBO=3353713.28, deltaELBO=6732.948 (0.00522928%), Factors=30
Iteration 14: time=12.39, ELBO=3360238.09, deltaELBO=6524.816 (0.00506763%), Factors=30
Iteration 15: time=12.44, ELBO=3366188.23, deltaELBO=5950.139 (0.00462129%), Factors=30
Iteration 16: time=12.51, ELBO=3371048.61, deltaELBO=4860.380 (0.00377491%), Factors=30
Iteration 17: time=12.89, ELBO=3374570.98, deltaELBO=3522.370 (0.00273572%), Factors=30
Iteration 18: time=12.55, ELBO=3376839.12, deltaELBO=2268.136 (0.00176159%), Factors=30
Iteration 19: time=12.73, ELBO=3378192.86, deltaELBO=1353.743 (0.00105141%), Factors=30
Iteration 20: time=12.52, ELBO=3378991.95, deltaELBO=799.091 (0.00062063%), Factors=30
Iteration 21: time=13.01, ELBO=3379480.81, deltaELBO=488.855 (0.00037968%), Factors=30
Iteration 22: time=13.31, ELBO=3379799.31, deltaELBO=318.506 (0.00024737%), Factors=30
Iteration 23: time=12.88, ELBO=3380022.73, deltaELBO=223.413 (0.00017352%), Factors=30
Iteration 24: time=12.63, ELBO=3380190.96, deltaELBO=168.236 (0.00013066%), Factors=30
Iteration 25: time=12.34, ELBO=3380325.28, deltaELBO=134.315 (0.00010432%), Factors=30
Iteration 26: time=12.58, ELBO=3380438.01, deltaELBO=112.735 (0.00008756%), Factors=30
Iteration 27: time=12.79, ELBO=3380536.80, deltaELBO=98.786 (0.00007672%), Factors=30
Iteration 28: time=12.90, ELBO=3380626.07, deltaELBO=89.271 (0.00006933%), Factors=30
Iteration 29: time=12.76, ELBO=3380708.35, deltaELBO=82.280 (0.00006390%), Factors=30
Iteration 30: time=11.58, ELBO=3380785.55, deltaELBO=77.198 (0.00005996%), Factors=30
Iteration 31: time=11.44, ELBO=3380859.00, deltaELBO=73.448 (0.00005704%), Factors=30
Iteration 32: time=11.31, ELBO=3380929.41, deltaELBO=70.411 (0.00005469%), Factors=30
Iteration 33: time=11.48, ELBO=3380997.29, deltaELBO=67.881 (0.00005272%), Factors=30
Iteration 34: time=11.42, ELBO=3381063.10, deltaELBO=65.813 (0.00005112%), Factors=30
Iteration 35: time=11.60, ELBO=3381127.15, deltaELBO=64.045 (0.00004974%), Factors=30
Iteration 36: time=12.96, ELBO=3381189.61, deltaELBO=62.460 (0.00004851%), Factors=30
Converged!
#######################
## Training finished ##
#######################
Warning: Output file /nfs/team205/ed6/data/Fetal_immune/LMM_data/LMM_input_LYMPHOID_PBULK/LYMPHOID_mofa_model_oneview_organCorrected.hdf5 already exists, it will be replaced
Saving model in /nfs/team205/ed6/data/Fetal_immune/LMM_data/LMM_input_LYMPHOID_PBULK/LYMPHOID_mofa_model_oneview_organCorrected.hdf5...
23 factors were found to explain little or no variance and they were removed for downstream analysis. You can disable this option by setting load_model(..., remove_inactive_factors = F)
Error in .quality_control(object, verbose = verbose) :
!duplicated(unlist(samples_names(object))) are not all TRUE
# Remove inactive factors
if (remove_inactive_factors) {
r2 <- rowSums(do.call('cbind', lapply(object@cache[["variance_explained"]]$r2_per_factor, rowSums, na.rm=TRUE)))
var.threshold <- 0.0001
if (all(r2 < var.threshold)) {
warning(sprintf("All %s factors were found to explain little or no variance so remove_inactive_factors option has been disabled.", length(r2)))
} else if (any(r2 < var.threshold)) {
object <- subset_factors(object, which(r2>=var.threshold), recalculate_variance_explained=FALSE)
message(sprintf("%s factors were found to explain no variance and they were removed for downstream analysis. You can disable this option by setting load_model(..., remove_inactive_factors = FALSE)", sum(r2 < var.threshold)))
}
}
Error in subset_factors(object, which(r2 >= var.threshold), recalculate_variance_explained = FALSE) :
unused argument (recalculate_variance_explained = FALSE)
plot_variance_explained(mofa_trained, x='factor', y='group', split_by = 'view', plot_total = TRUE, max_r2 = 50)[[1]] +
theme(axis.text.x = element_text(angle=45, hjust=1))
get_variance_explained(mofa_trained, as.data.frame = TRUE)[[2]] %>%
ggplot(aes(group, value)) +
geom_col() +
coord_flip() +
ylab("Var. (%)") +
theme_classic(base_size=14)
Plot by celltype
get_variance_explained(mofa_trained, as.data.frame = TRUE)[[1]] %>%
ggplot(aes(factor, value)) + geom_col() +
coord_flip() +
facet_wrap(group~., ncol = 6, scales = "free_x")
plot_factor_cor(mofa_trained, method = "spearman")
## Correlation with principal components
pcs <- reducedDim(sce)
fctrs <- get_factors(mofa_trained) %>%
purrr::reduce(rbind)
corrplot::corrplot(cor(pcs, fctrs[rownames(pcs),]))
plot_factor_ordered <- function(mofa_trained, f){
factor_df <- get_factors(mofa_trained, factors = f, as.data.frame = TRUE) %>%
mutate(organ = sapply(str_split(sample, "_"), function(x) x[2])) %>%
group_by(group) %>%
mutate(gr_mean = median(value)) %>%
ungroup() %>%
arrange(gr_mean) %>%
mutate(group=factor(group, levels=unique(group)))
r2_df <- get_variance_explained(mofa_trained, factors = f, as.data.frame = TRUE)[[1]] %>%
filter(factor==paste0('Factor',f)) %>%
mutate(group=factor(group, levels = levels(factor_df$group)))
pl1 <- factor_df %>%
ggplot(aes(group, value)) +
geom_boxplot() +
geom_jitter(aes(color= organ), size=0.7) +
geom_hline(yintercept = 0, linetype=2) +
coord_flip() +
ylab(paste0("Factor ", f)) +
theme_bw(base_size = 14)
pl2 <- r2_df %>%
ggplot(aes(group, value)) +
geom_col() +
coord_flip() +
ylab("% variance explained") +
theme_bw(base_size = 14) +
remove_y_axis()
pl1 + pl2 + plot_layout(widths=c(2,1), guides="collect")
}
get_top_celltype_per_factor <- function(mofa_trained, f){
r2_df <- get_variance_explained(mofa_trained, factors = f, as.data.frame = TRUE)[[1]] %>%
filter(factor==paste0('Factor',f))
# mutate(group=factor(group, levels = ))
top_quant_r2 <- quantile(r2_df$value, probs = seq(0, 1, by = 0.2))["80%"]
top_groups <- r2_df$group[r2_df$value >= top_quant_r2]
return(top_groups)
}
save_factor_id <- function(mofa_trained, f, figdir){
## Order celltypes by factor values
p1 <- plot_factor_ordered(mofa_trained, f)
## Plot factor values across organs for celltypes with high variance explained
p2 <- plot_factor(mofa_trained, factors = f, groups = get_top_celltype_per_factor(mofa_trained, f), group_by = "group",
color_by = "organ",
dot_size = 2, dodge = TRUE
)
## Plot factor weights on genes
# plot_data_heatmap(mofa_trained, factor = f, nfeatures = 50, text_size = 3, show_colnames=FALSE,
# annotation_samples = c("organ", "time", "method", "donor"))
p3 <- plot_weights(mofa_trained, factors = f, nfeatures = 30, text_size = 3) +
scale_y_discrete(expand=c(0.1, 0.1))
(p1 | (p2 / p3)) +
plot_layout(guides="collect") +
ggsave(glue("{figdir}/MOFA_{split}_factorID_factor{f}.pdf"), width = 15, height = 10)
}
for (f in 1:mofa_trained@dimensions$K){
print(paste0("Saving ID for Factor ", f, "..."))
save_factor_id(mofa_trained, f=f, figdir = figdir)
}
# save_factor_id(mofa_trained, f=1, figdir = figdir)
# plot_weights(mofa_trained, factors = f, nfeatures = 30, text_size = 3) +
# scale_y_discrete(expand=c(0.1, 0.1))
## Get factors that explain most variance in each celltype
get_top_factor_per_celltype <- function(mofa_trained, gr, min_R2=2){
get_variance_explained(mofa_trained, as.data.frame = TRUE)[[1]] %>%
filter(group==gr) %>%
filter(value >= min_R2) %>%
pull(factor) %>%
as.character()
}
## Make KNN graph based on similarity of top factors for each celltype
get_ct_KNN_graph <- function(mofa_trained, gr, min_R2=5, k=5){
## Get factors that explain most variance per celltype
fs <- get_top_factor_per_celltype(mofa_trained, gr, min_R2 = min_R2)
## Make KNN graph from top factors
Z <- get_factors(mofa_trained, groups=gr, factors = fs)[[1]]
knn_ct <- buildKNNGraph(t(Z), k=k)
## Add attributes
metadata_ct <- samples_metadata(mofa_trained)[rownames(Z),]
# covariates
V(knn_ct)$organ <- metadata_ct$organ
V(knn_ct)$age <- metadata_ct$age
V(knn_ct)$n_cells <- metadata_ct$n_cells
V(knn_ct)$method <- metadata_ct$method
V(knn_ct)$donor <- metadata_ct$donor
# top factors
for (c in colnames(Z)){
vertex_attr(knn_ct)[[c]] <- Z[,c]
}
return(knn_ct)
}
## Plot KNN graph
plot_ct_KNN_graph <- function(knn, color_by="organ"){
## Define color
if (!color_by %in% names(vertex_attr(knn))){
stop("specified color_by variable is not in vertex_attr(knn)")
}
if (color_by=="organ"){
scale_color_knngraph <- scale_color_manual(values=org_colors)
} else if (is.numeric(vertex_attr(knn, color_by))){
scale_color_knngraph <- scale_color_viridis_c(option="magma")
} else {
scale_color_knngraph <- scale_color_discrete()
}
vertex_attr(knn, "color_by") <- vertex_attr(knn, color_by)
ggraph(knn) +
geom_edge_link0() +
geom_node_point(aes(color=color_by, size=n_cells)) +
theme(panel.background = element_blank()) +
scale_color_knngraph +
scale_size(range=c(2,7))
}
get_top_factor_per_celltype(mofa_trained, "CD8+T", min_R2 = 5)
plot_ct_KNN_graph(get_ct_KNN_graph(mofa_trained, "SMALL PRE B CELL", k=5), color_by = 'organ') +
plot_ct_KNN_graph(get_ct_KNN_graph(mofa_trained, "SMALL PRE B CELL", k=5), color_by = 'Factor4')
all_groups <- names(get_data(mofa_trained)[[1]])
knn_graph_pl <- lapply(all_groups, function(g){
knn <- get_ct_KNN_graph(mofa_trained, g, k=5, min_R2 = 2)
plot_ct_KNN_graph(knn, color_by = 'organ') + ggtitle(g)
})
knn_graph_pl <- setNames(knn_graph_pl, all_groups)
knn_graph_pl$TREG
## Score connectivity between samples from the same organ
.calc_connectivity_score <- function(knn, o){
adj <- get.adjacency(knn)
n_org <- sum(V(knn)$organ==o)
n_other <- sum(V(knn)$organ!=o)
within_edges <- sum(adj[V(knn)$organ==o,V(knn)$organ==o])
between_edges <- sum(adj[V(knn)$organ==o,V(knn)$organ!=o])
score <- (within_edges/between_edges)*(n_other/n_org)
return(score)
}
## Calculate connectivity score for permutations of node labels
conn_score_test <- function(knn, o, n_perm=1000){
real_score <- .calc_connectivity_score(knn, o)
## Random permutations
rand_scores <- c()
for (i in 1:n_perm){
rand_knn <- knn
V(rand_knn)$organ <- sample(V(knn)$organ)
rand_scores <- c(rand_scores, .calc_connectivity_score(rand_knn, o))
}
p_val <- sum(c(rand_scores, real_score) >= real_score)/(n_perm + 1)
if (p_val < 2e-16){ p_val <- 2e-16}
return(c('score'=real_score,'p_value'=p_val))
}
## Calculate connectivity score + significance with permutation test
test_conn_group <- function(mofa_trained, g, k=5, min_R2 = 2, n_perm=1000){
knn <- get_ct_KNN_graph(mofa_trained, g, k=k, min_R2 = min_R2)
test_orgs <- names(table(V(knn)$organ))[table(V(knn)$organ) > 2]
return(sapply(test_orgs, function(o) conn_score_test(knn, o, n_perm=n_perm)))
}
connectivity_test_ls <- lapply(all_groups, function(g) test_conn_group(mofa_trained, g))
connectivity_test_ls <- setNames(connectivity_test_ls, all_groups)
connectivity_test_df <- imap(connectivity_test_ls, ~ data.frame(t(.x)) %>% rownames_to_column("organ") %>% mutate(group=.y)) %>%
purrr::reduce(bind_rows) %>%
mutate(is_signif = ifelse(p_value < 0.01, TRUE, FALSE))
connectivity_test_df %>%
ggplot(aes(organ, group,fill=log10(score))) +
geom_tile() +
scale_fill_distiller(palette="Reds", direction = 1) +
geom_text(data=. %>% filter(is_signif), label="*", size=5)
connectivity_test_df %>%
group_by(group) %>%
mutate(mean_val=median(score)) %>%
ungroup() %>%
arrange(-mean_val) %>%
mutate(group=factor(group, levels=unique(group))) %>%
ggplot(aes(organ, log1p(score))) +
geom_col(fill="grey") +
geom_col(data=. %>% filter(is_signif), aes(fill=organ)) +
scale_fill_manual(values=org_colors) +
coord_flip() +
facet_grid(group~.) +
theme(strip.text.y = element_text(angle=0))
get_top_weight_genes <- function(mofa_trained, f, n_top=20, which="top"){
w_df <- get_weights(mofa_trained, factors = f, as.data.frame = TRUE) %>%
arrange(value)
if (which=="top") {
w_df %>%
top_n(n_top, value) %>%
pull(feature) %>%
as.character()
} else if (which=="bottom"){
w_df %>%
top_n(n_top, -value) %>%
pull(feature) %>%
as.character()
}
}
plot_data_top_weights <- function(mofa_trained, ct, f, n_top=20, which="top"){
genes <- get_top_weight_genes(mofa_trained, f, which=which, n_top=n_top)
data <- get_data(mofa_trained, groups=ct)[[1]][[1]][genes,]
pl_df <- reshape2::melt(data, varnames=c("gene", "sample")) %>%
left_join(samples_metadata(mofa_trained)) %>%
arrange(age) %>%
mutate(sample=factor(sample, levels=unique(sample))) %>%
group_by(gene) %>%
mutate(value=scale(value))
pl_df %>%
ggplot(aes(sample, gene, fill=value)) +
geom_tile() +
facet_grid(.~organ, space="free", scales="free") +
scale_fill_gradient2(high="red", low="blue", name="Scaled\nexpression") +
xlab("----age--->") + ylab(glue("{which} weight genes")) +
theme_bw(base_size=16) +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank()) +
ggtitle(glue('{ct} - {f}'))
}
for (g in all_groups){
fs <- get_top_factor_per_celltype(mofa_trained, g, min_R2=3)
top_plots <- lapply(fs, function(x) (plot_data_top_weights(mofa_trained, g, x, which="top") + remove_x_axis()) /
plot_data_top_weights(mofa_trained, g, x, which="bottom") + ggtitle("")
)
wrap_plots(top_plots, ncol=1) +
ggsave(glue("{figdir}/top_factors_expr_{g}.pdf"), width=8, height = 7*length(top_plots))
}
plot_data_heatmap(mofa_trained, factor = 2, show_colnames=FALSE, annotation_samples = c("anno_lvl_2_final_clean", "organ"))
plot_factor(mofa_trained, factor=25, color_by="method", dot_size = 4)
# BiocManager::install("MOFAdata")
library(MOFAdata)
utils::data(reactomeGS)
head(rownames(reactomeGS))
## Remove row with NA
reactomeGS <- reactomeGS[!is.na(rownames(reactomeGS)),]
library(EnsDb.Hsapiens.v86)
hg.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
all_genes <- ensembldb::genes(EnsDb.Hsapiens.v86)
detach(package:EnsDb.Hsapiens.v86)
detach(package:ensembldb)
# gene_name_2_id <- function(gene){
# return(all_genes[all_genes$gene_name==gene,]$gene_id[1])
# }
#
# gene_ids <- sapply(mofa_trained@features_metadata$feature, gene_name_2_id)
# rowData(sce)["gene_id"] <- gene_ids
# rowData(sce)["gene_name"] <- rownames(sce)
gene_names_reactome <- all_genes[colnames(reactomeGS)]$gene_name
colnames(reactomeGS) <- gene_names_reactome
Subset to genes tested
reactomeGS_universe <- reactomeGS[, colnames(reactomeGS) %in% mofa_trained@features_metadata$feature]
# GSEA on positive weights, with default options
res.positive <- run_enrichment(mofa_trained,
view='scaled_logcounts',
# statistical.test = 'cor.adj.parametric',
feature.sets = reactomeGS_universe,
sign = "positive",
)
# GSEA on negative weights, with default options
res.negative <- run_enrichment(mofa_trained,
view='scaled_logcounts',
# statistical.test = 'cor.adj.parametric',
feature.sets = reactomeGS_universe,
sign = "negative"
)
for (f in 1:mofa_trained@dimensions$K){
if (min(res.positive$pval.adj[,paste0("Factor", f)]) < 0.1) {
print(plot_enrichment(res.positive, factor = f, alpha=0.1) + ggtitle("Positive weights") +
plot_enrichment(res.negative, factor = f, alpha=0.1) + ggtitle("Negative weights") +
plot_annotation(title=paste0("Factor", f)))
}
}
signif_pathways <- rownames(data.frame(res.negative$pval.adj))[order(data.frame(res.negative$pval.adj)[["Factor8"]])[0:10]]
colnames(reactomeGS_universe)[reactomeGS_universe[signif_pathways[5],]==1]
plot_enrichment_detailed(res.negative, factor = 8)
–> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –>
–> –> –> –> –>
–> –> –>